Filter criteria:

glue('Number of genes: ', nrow(datExpr), '\n',
     'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
     sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 6417
## Number of samples: 86 (51 ASD, 35 controls)

Dimensionality reduction using PCA

First principal component explains over 90% of the total variance

reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.98){
  
  datExpr = data.frame(datExpr)
  
  datExpr_pca = prcomp(datExpr, scale.=TRUE)
  par(mfrow=c(1,2))
  plot(summary(datExpr_pca)$importance[2,], type='b')
  plot(summary(datExpr_pca)$importance[3,], type='b')
  abline(h=var_explained, col='blue')
  
  last_pc = data.frame(summary(datExpr_pca)$importance[3,]) %>% rownames_to_column(var='ID') %>% 
    filter(.[[2]] >= var_explained) %>% top_n(-1, ID)
  
  print(glue('Keeping top ', substr(last_pc$ID, 3, nchar(last_pc$ID)), ' components that explain ',
             var_explained*100, '% of the variance'))
  
  datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:last_pc$ID)
  
  return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}

reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)

## Keeping top 12 components that explain 98% of the variance
datExpr_redDim = reduce_dim_output$datExpr
pca_output = reduce_dim_output$pca_output

rm(datSeq, datProbes, reduce_dim_datExpr, reduce_dim_output, datExpr)

Genes seem to have separated into two clouds of points:

datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2)) + geom_point() + theme_minimal()

Projecting all the original points into the space created by the two principal components and colouring by the differential expression p-value we can see that the points in the middle of the two clouds were filtered out because their DE wasn’t statistically significant

load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')

# Remove Subject with ID = 'AN03345'
keep = datMeta$Subject_ID!='AN03345'
datMeta = datMeta[keep,]
datExpr = datExpr[,keep]

# Calculate differential expression p-value of each gene
mod = model.matrix(~ Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
ASD_pvals = fit$p.value[,'DxASD']

pca_data_projection = scale(datExpr) %*% pca_output$rotation %>% data.frame
pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=ASD_pvals)) + geom_point(alpha=0.5) + theme_minimal()

rm(mod, corfit, lmfit, fit, ASD_pvals, datExpr, datProbes, datSeq)

Clustering

clusterings = list()

K-means Clustering

set.seed(123)
wss = sapply(1:10, function(k) kmeans(datExpr_redDim, k, iter.max=100, nstart=25,
                                      algorithm='MacQueen')$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')
best_k = 4
abline(v = best_k, col='blue')

datExpr_k_means = kmeans(datExpr_redDim, best_k, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster

Hierarchical Clustering

Chose k=6 as best number of clusters.

Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.

Younger ASD samples seem to be more similar to control samples than older ASD samples (pink cluster has most of the youngest samples). The yellow cluster is made of young ASD samples.

h_clusts = datExpr_redDim %>% dist %>% hclust
plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)

best_k = 5
clusterings[['hc']] = cutree(h_clusts, best_k)

Consensus Clustering

Samples are grouped into two big clusters and then each cluster in 4 and 6 subclusters, respectively. The first big separation into two clusters is very clear, the subclusters not so much.

*Output plots in clustering_genes_03_15 folder

Independent Component Analysis

Following this paper’s guidelines:

  1. Run PCA and keep enough components to explain 60% of the variance

  2. Run ICA with that same number of nbComp as principal components kept to then filter them

  3. Select components with kurtosis > 3

  4. Assign genes to clusters with FDR<0.001 using the fdrtool package

ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')
signals_w_kurtosis = ICA_output$S %>% data.frame %>% dplyr::select(names(apply(ICA_output$S, 2, kurtosis)>3))
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F)$qval<0.01) %>% data.frame
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]

clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c

Leaves most of the observations (~75%) without a cluster:

ICA_clusters %>% rowSums %>% table
## .
##    0    1    2    3    4    5    6    7 
## 4851  991  390  131   37   13    3    1
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) + 
  geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') + 
  theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()

WGCNA

best_power=40 but blockwiseModules only accepts powers up to 30, so 30 was used instead

best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(30, 50, by=1))
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1     30    0.637 -0.320          0.951     817       747   2090
## 2     31    0.675 -0.329          0.955     792       713   2050
## 3     32    0.700 -0.337          0.954     768       683   2020
## 4     33    0.724 -0.348          0.954     745       653   1980
## 5     34    0.745 -0.356          0.947     723       625   1950
## 6     35    0.774 -0.368          0.946     702       597   1920
## 7     36    0.793 -0.378          0.948     683       571   1890
## 8     37    0.808 -0.386          0.943     664       548   1860
## 9     38    0.823 -0.394          0.946     646       525   1830
## 10    39    0.837 -0.401          0.948     628       502   1800
## 11    40    0.857 -0.410          0.954     612       481   1770
## 12    41    0.863 -0.420          0.950     596       463   1750
## 13    42    0.874 -0.427          0.951     581       445   1720
## 14    43    0.878 -0.437          0.949     566       427   1700
## 15    44    0.890 -0.444          0.950     552       411   1670
## 16    45    0.894 -0.452          0.947     539       395   1650
## 17    46    0.892 -0.460          0.936     526       380   1630
## 18    47    0.897 -0.470          0.935     513       365   1610
## 19    48    0.901 -0.478          0.930     501       352   1580
## 20    49    0.909 -0.487          0.936     489       339   1560
## 21    50    0.916 -0.493          0.938     478       326   1540
network = datExpr_redDim %>% t %>% blockwiseModules(power=30, numericLabels=TRUE)
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)

It leaves 476 genes without a cluster:

table(clusterings[['WGCNA']])
## 
##    0    1    2    3    4    5    6    7    8    9 
##  476 3651 1350  314  294  180   50   48   27   27

Gaussian Mixture Models with hard thresholding

Number of clusters that resemble more Gaussian mixtures = 32 but perhaps a smaller number could also work (maybe 5, 12 or 17)

n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')

best_k = 32
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)

Plot of clusters with their centroids in gray

gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())

Manual clustering

Separate the two clouds of points by a straight line. There seems to be a difference in the mean expression of the genes between clusters but not in their standard deviation.

manual_clusters = as.factor(as.numeric(0.08*datExpr_redDim$PC1 + 0.2 > datExpr_redDim$PC2))
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2, color=manual_clusters)) + geom_point() + 
  geom_abline(slope=0.08, intercept=0.2, color='gray') + theme_minimal()

names(manual_clusters) = rownames(datExpr_redDim)

clusterings[['Manual']] = manual_clusters

manual_clusters_data = cbind(apply(datExpr_redDim, 1, mean), apply(datExpr_redDim, 1, sd), 
                             manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) + 
  geom_density(alpha=0.4) + theme_minimal()

manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) + 
  geom_density(alpha=0.4) + theme_minimal()

rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output, 
   ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, 
   best_power, c, manual_clusters, manual_clusters_data)

Compare clusterings

Using Adjusted Rand Index:

cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in 1:(length(clusterings))){
  cluster1 = as.factor(clusterings[[i]])
  for(j in (i):length(clusterings)){
    cluster2 = as.factor(clusterings[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)

cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none', 
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(7,7))

rm(i, j, cluster1, cluster2, cluster_sim)

Scatter plots

create_2D_plot = function(cat_var){
 ggplotly(plot_points %>% ggplot(aes_string(x='PC1', y='PC2', color=cat_var)) + 
          geom_point() + theme_minimal() + 
          xlab(paste0('PC1 (', round(summary(pca_output)$importance[2,1]*100,2),'%)')) +
          ylab(paste0('PC2 (', round(summary(pca_output)$importance[2,2]*100,2),'%)')))
}
create_3D_plot = function(cat_var){
  plot_points %>% plot_ly(x=~PC1, y=~PC2, z=~PC3) %>% add_markers(color=plot_points[,cat_var], size=1) %>% 
    layout(title = glue('Samples coloured by ', cat_var),
           scene = list(xaxis=list(title=glue('PC1 (',round(summary(pca_output)$importance[2,1]*100,2),'%)')),
                        yaxis=list(title=glue('PC2 (',round(summary(pca_output)$importance[2,2]*100,2),'%)')),
                        zaxis=list(title=glue('PC3 (',round(summary(pca_output)$importance[2,3]*100,2),'%)'))))  
}

plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
              mutate(ID = rownames(.),                           km_clust = as.factor(clusterings[['km']]),
                hc_clust = as.factor(clusterings[['hc']]),       cc_l1_clust = as.factor(clusterings[['cc_l1']]),
                cc_clust = as.factor(clusterings[['cc_l2']]),    ica_clust = as.factor(clusterings[['ICA_min']]),
                n_ica_clust = as.factor(rowSums(ICA_clusters)),  gmm_clust = as.factor(clusterings[['GMM']]),
                wgcna_clust = as.factor(clusterings[['WGCNA']]), manual_clust=as.factor(clusterings[['Manual']]))

2D plots of clusterings

create_2D_plot('km_clust')
create_2D_plot('hc_clust')
create_2D_plot('cc_l1_clust')
create_2D_plot('cc_clust')
create_2D_plot('ica_clust')
create_2D_plot('gmm_clust')
create_2D_plot('wgcna_clust')

3D plots

create_3D_plot('ica_clust')
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors